library(latexpdf)
library(stringr)
library(ggplot2)
library(gridExtra)
library(faraway)
library(MASS)
library(olsrr)
library(Metrics)
library(caret)
library(corrplot)
About the Data
First, we will read the white wine data and look at the fundamentals of the data.
white.wine = read.csv("winequality-white.csv", header=TRUE, sep=";")
head(white.wine)
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1 7.0 0.27 0.36 20.7 0.045
## 2 6.3 0.30 0.34 1.6 0.049
## 3 8.1 0.28 0.40 6.9 0.050
## 4 7.2 0.23 0.32 8.5 0.058
## 5 7.2 0.23 0.32 8.5 0.058
## 6 8.1 0.28 0.40 6.9 0.050
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
## 1 45 170 1.0010 3.00 0.45 8.8
## 2 14 132 0.9940 3.30 0.49 9.5
## 3 30 97 0.9951 3.26 0.44 10.1
## 4 47 186 0.9956 3.19 0.40 9.9
## 5 47 186 0.9956 3.19 0.40 9.9
## 6 30 97 0.9951 3.26 0.44 10.1
## quality
## 1 6
## 2 6
## 3 6
## 4 6
## 5 6
## 6 6
names(white.wine)
## [1] "fixed.acidity" "volatile.acidity" "citric.acid"
## [4] "residual.sugar" "chlorides" "free.sulfur.dioxide"
## [7] "total.sulfur.dioxide" "density" "pH"
## [10] "sulphates" "alcohol" "quality"
dim(white.wine)
## [1] 4898 12
sapply(white.wine, class)
## fixed.acidity volatile.acidity citric.acid
## "numeric" "numeric" "numeric"
## residual.sugar chlorides free.sulfur.dioxide
## "numeric" "numeric" "numeric"
## total.sulfur.dioxide density pH
## "numeric" "numeric" "numeric"
## sulphates alcohol quality
## "numeric" "numeric" "integer"
white.info = summary(white.wine)
white.info
## fixed.acidity volatile.acidity citric.acid residual.sugar
## Min. : 3.800 Min. :0.0800 Min. :0.0000 Min. : 0.600
## 1st Qu.: 6.300 1st Qu.:0.2100 1st Qu.:0.2700 1st Qu.: 1.700
## Median : 6.800 Median :0.2600 Median :0.3200 Median : 5.200
## Mean : 6.855 Mean :0.2782 Mean :0.3342 Mean : 6.391
## 3rd Qu.: 7.300 3rd Qu.:0.3200 3rd Qu.:0.3900 3rd Qu.: 9.900
## Max. :14.200 Max. :1.1000 Max. :1.6600 Max. :65.800
## chlorides free.sulfur.dioxide total.sulfur.dioxide density
## Min. :0.00900 Min. : 2.00 Min. : 9.0 Min. :0.9871
## 1st Qu.:0.03600 1st Qu.: 23.00 1st Qu.:108.0 1st Qu.:0.9917
## Median :0.04300 Median : 34.00 Median :134.0 Median :0.9937
## Mean :0.04577 Mean : 35.31 Mean :138.4 Mean :0.9940
## 3rd Qu.:0.05000 3rd Qu.: 46.00 3rd Qu.:167.0 3rd Qu.:0.9961
## Max. :0.34600 Max. :289.00 Max. :440.0 Max. :1.0390
## pH sulphates alcohol quality
## Min. :2.720 Min. :0.2200 Min. : 8.00 Min. :3.000
## 1st Qu.:3.090 1st Qu.:0.4100 1st Qu.: 9.50 1st Qu.:5.000
## Median :3.180 Median :0.4700 Median :10.40 Median :6.000
## Mean :3.188 Mean :0.4898 Mean :10.51 Mean :5.878
## 3rd Qu.:3.280 3rd Qu.:0.5500 3rd Qu.:11.40 3rd Qu.:6.000
## Max. :3.820 Max. :1.0800 Max. :14.20 Max. :9.000
Include some basic descriptions including the class and the descriptive statistics of the variables. + Missing value
Data Visualization - Histograms
First, we will generate histograms for the all the variables. This will tell us the approximate distribution of each variables.
white.hist_y = ggplot(aes(quality), data = white.wine) +
geom_histogram(aes(color=I('black'), fill=I('beige')), binwidth = 1) +
ggtitle('Histogram of Quality') + theme(plot.title = element_text(size=9))
white.hist_x1 = ggplot(aes(fixed.acidity), data = white.wine) +
geom_histogram(aes(color=I('black'), fill=I('beige')), binwidth = 0.5) +
ggtitle('Histogram of Fixed Acidity') + theme(plot.title = element_text(size=8))
white.hist_x2 = ggplot(aes(volatile.acidity), data = white.wine) +
geom_histogram(aes(color=I('black'), fill=I('beige')), binwidth = 0.1) +
ggtitle('Histogram of Volatile Acidity') + theme(plot.title = element_text(size=7.5))
white.hist_x3 = ggplot(aes(citric.acid), data = white.wine) +
geom_histogram(aes(color=I('black'), fill=I('beige')), binwidth = 0.1) +
ggtitle('Histogram of Citric Acid') + theme(plot.title = element_text(size=8))
white.hist_x4 = ggplot(aes(residual.sugar), data = white.wine) +
geom_histogram(aes(color=I('black'), fill=I('beige')), binwidth = 5) +
ggtitle('Histogram of Residual Sugar') + theme(plot.title = element_text(size=6.7))
white.hist_x5 = ggplot(aes(chlorides), data = white.wine) +
geom_histogram(aes(color=I('black'), fill=I('beige')), binwidth = 0.05) +
ggtitle('Histogram of Chlorides') + theme(plot.title = element_text(size=8.5))
white.hist_x6 = ggplot(aes(free.sulfur.dioxide), data = white.wine) +
geom_histogram(aes(color=I('black'), fill=I('beige')), binwidth = 10.4) +
ggtitle('Histogram of Free Sulfur Dioxide') + theme(plot.title = element_text(size=6))
white.hist_x7 = ggplot(aes(total.sulfur.dioxide), data = white.wine) +
geom_histogram(aes(color=I('black'), fill=I('beige')), binwidth = 16) +
ggtitle('Histogram of Total Sulfur Dioxide') + theme(plot.title = element_text(size=6))
white.hist_x8 = ggplot(aes(density), data = white.wine) +
geom_histogram(aes(color=I('black'), fill=I('beige')), binwidth = 0.004) +
ggtitle('Histogram of Density') + theme(plot.title = element_text(size=9))
white.hist_x9 = ggplot(aes(pH), data = white.wine) +
geom_histogram(aes(color=I('black'), fill=I('beige')), binwidth = 0.08) +
ggtitle('Histogram of pH') + theme(plot.title = element_text(size=9))
white.hist_x10 = ggplot(aes(sulphates), data = white.wine) +
geom_histogram(aes(color=I('black'), fill=I('beige')), binwidth = 0.1) +
ggtitle('Histogram of Sulphates') + theme(plot.title = element_text(size=8))
white.hist_x11 = ggplot(aes(alcohol), data = white.wine) +
geom_histogram(aes(color=I('black'), fill=I('beige')), binwidth = 0.3) +
ggtitle('Histogram of Alcohol') + theme(plot.title = element_text(size=9))
grid.arrange(white.hist_y, white.hist_x1, white.hist_x2, white.hist_x3, white.hist_x4, white.hist_x5, white.hist_x6, white.hist_x7, white.hist_x8, white.hist_x9, white.hist_x10, white.hist_x11, ncol = 4, top="Histograms for White Wine Data")
There are a couple of things that we can see from the histograms above.
Data Visualization - Regressors vs. Predictor
Now, we will plot each of the regressors against the predictor. This will help us see the approximate relationship between each varibles and wine quality.
white.group_x1 = ggplot(aes(fixed.acidity, quality), data = white.wine) +
ggtitle("Fixed Acidity vs Quality") +
geom_jitter(width = 0.25, alpha = 0.1, colour = "firebrick3") +
geom_smooth(method = "lm", se = FALSE, colour = "black") + theme(plot.title = element_text(size=9))
white.group_x2 = ggplot(aes(volatile.acidity, quality), data = white.wine) +
ggtitle("Volatile Acidity vs Quality") +
geom_jitter(width = 0.25, alpha = 0.1, colour = "firebrick3") +
geom_smooth(method = "lm", se = FALSE, colour = "black") + theme(plot.title = element_text(size=8.5))
white.group_x3 = ggplot(aes(citric.acid, quality), data = white.wine) +
ggtitle("Citric Acid vs Quality") +
geom_jitter(width = 0.25, alpha = 0.1, colour = "firebrick3") +
geom_smooth(method = "lm", se = FALSE, colour = "black") + theme(plot.title = element_text(size=9))
white.group_x4 = ggplot(aes(residual.sugar, quality), data = white.wine) +
ggtitle("Residual Sugar vs Quality") +
geom_jitter(width = 0.25, alpha = 0.1, colour = "firebrick3") +
geom_smooth(method = "lm", se = FALSE, colour = "black") + theme(plot.title = element_text(size=8))
white.group_x5 = ggplot(aes(chlorides, quality), data = white.wine) +
ggtitle("Chlorides vs Quality") +
geom_jitter(width = 0.25, alpha = 0.1, colour = "firebrick3") +
geom_smooth(method = "lm", se = FALSE, colour = "black") + theme(plot.title = element_text(size=9))
white.group_x6 = ggplot(aes(free.sulfur.dioxide, quality), data = white.wine) +
ggtitle("Free Sulfur Dioxide vs Quality") +
geom_jitter(width = 0.25, alpha = 0.1, colour = "firebrick3") +
geom_smooth(method = "lm", se = FALSE, colour = "black") + theme(plot.title = element_text(size=7))
white.group_x7 = ggplot(aes(total.sulfur.dioxide, quality), data = white.wine) +
ggtitle("Total Sulfur Dioxide vs Quality") +
geom_jitter(width = 0.25, alpha = 0.1, colour = "firebrick3") +
geom_smooth(method = "lm", se = FALSE, colour = "black") + theme(plot.title = element_text(size=6.5))
white.group_x8 = ggplot(aes(density, quality), data = white.wine) +
ggtitle("Density vs Quality") +
geom_jitter(width = 0.25, alpha = 0.1, colour = "firebrick3") +
geom_smooth(method = "lm", se = FALSE, colour = "black") + theme(plot.title = element_text(size=9))
white.group_x9 = ggplot(aes(pH, quality), data = white.wine) +
ggtitle("pH vs Quality") +
geom_jitter(width = 0.25, alpha = 0.1, colour = "firebrick3") +
geom_smooth(method = "lm", se = FALSE, colour = "black") + theme(plot.title = element_text(size=9))
white.group_x10 = ggplot(aes(sulphates, quality), data = white.wine) +
ggtitle("Sulphates vs Quality") +
geom_jitter(width = 0.25, alpha = 0.1, colour = "firebrick3") +
geom_smooth(method = "lm", se = FALSE, colour = "black") + theme(plot.title = element_text(size=9))
white.group_x11 = ggplot(aes(alcohol, quality), data = white.wine) +
ggtitle("Alcohol vs Quality") +
geom_jitter(width = 0.25, alpha = 0.1, colour = "firebrick3") +
geom_smooth(method = "lm", se = FALSE, colour = "black") + theme(plot.title = element_text(size=9))
grid.arrange(white.hist_y, white.group_x1, white.group_x2, white.group_x3, white.group_x4, white.group_x5, white.group_x6, white.group_x7, white.group_x8, white.group_x9, white.group_x10, white.group_x11, ncol = 4)
There are a couple of things that we can see from the plots above.
Data Visualization - Boxplots
Now, we will generate boxplots of each variable and see more closely of the distribution of variables.
par(mfrow = c(3,4))
boxplot(white.wine$quality, col="slategray2", pch=19, xlab = "quality")
boxplot(white.wine$fixed.acidity, col="slategray2", pch=19, xlab = "Fixed Acidity")
boxplot(white.wine$volatile.acidity, col="slategray2", pch=19, xlab = "Volatile Acidity")
boxplot(white.wine$citric.acid, col="slategray2", pch=19, xlab = "Citric Acid")
boxplot(white.wine$residual.sugar, col="slategray2", pch=19, xlab = "Residual Sugar")
boxplot(white.wine$chlorides, col="slategray2", pch=19, xlab = "Chlorides")
boxplot(white.wine$free.sulfur.dioxide, col="slategray2", pch=19, xlab = "Free Sulfur Dioxide")
boxplot(white.wine$total.sulfur.dioxide, col="slategray2", pch=19, xlab = "Total Sulfur Dioxide")
boxplot(white.wine$density, col="slategray2", pch=19, xlab = "Density")
boxplot(white.wine$pH, col="slategray2", pch=19, xlab = "pH")
boxplot(white.wine$sulphates, col="slategray2", pch=19, xlab = "Sulphates")
boxplot(white.wine$alcohol, col="slategray2", pch=19, xlab = "Alcohol")
There are a couple of things that we can see from the boxplots above.
New Additional Data Point
Before we begin the data analysis, we will introduce the median point as a new additional data point to the white wine data.
white.datapoints = vector(mode = 'numeric', length = 12)
fixed.cen = (7.30 - mean(white.wine$fixed.acidity))
vol.cen = (0.32 - mean(white.wine$volatile.acidity))
citric.cen = (0.39 - mean(white.wine$citric.acid))
resid.cen = (9.9 - mean(white.wine$residual.sugar))
chlor.cen = (0.05 - mean(white.wine$chlorides))
free.cen = (46 - mean(white.wine$free.sulfur.dioxide))
tot.cen = (167 - mean(white.wine$total.sulfur.dioxide))
dens.cen = (0.9961 - mean(white.wine$density))
ph.cen = (3.28 - mean(white.wine$pH))
sul.cen = (0.55 - mean(white.wine$sulphates))
alc.cen = (11.40 - mean(white.wine$alcohol))
white.datapoints = c(fixed.cen, vol.cen, citric.cen, resid.cen, chlor.cen, free.cen, tot.cen, dens.cen, ph.cen, sul.cen, alc.cen, 6)
white.datapoints
## [1] 0.445212332 0.041758881 0.055808493 3.508585137 0.004227644
## [6] 10.691915067 28.639342589 0.002072624 0.091733361 0.060153124
## [11] 0.885732952 6.000000000
white.wine = rbind(white.wine, white.datapoints)
summary(white.wine)
## fixed.acidity volatile.acidity citric.acid residual.sugar
## Min. : 0.4452 Min. :0.04176 Min. :0.0000 Min. : 0.600
## 1st Qu.: 6.3000 1st Qu.:0.21000 1st Qu.:0.2700 1st Qu.: 1.700
## Median : 6.8000 Median :0.26000 Median :0.3200 Median : 5.200
## Mean : 6.8535 Mean :0.27819 Mean :0.3341 Mean : 6.391
## 3rd Qu.: 7.3000 3rd Qu.:0.32000 3rd Qu.:0.3900 3rd Qu.: 9.900
## Max. :14.2000 Max. :1.10000 Max. :1.6600 Max. :65.800
## chlorides free.sulfur.dioxide total.sulfur.dioxide density
## Min. :0.004228 Min. : 2.0 Min. : 9.0 Min. :0.002073
## 1st Qu.:0.036000 1st Qu.: 23.0 1st Qu.:108.0 1st Qu.:0.991720
## Median :0.043000 Median : 34.0 Median :134.0 Median :0.993740
## Mean :0.045764 Mean : 35.3 Mean :138.3 Mean :0.993825
## 3rd Qu.:0.050000 3rd Qu.: 46.0 3rd Qu.:167.0 3rd Qu.:0.996100
## Max. :0.346000 Max. :289.0 Max. :440.0 Max. :1.038980
## pH sulphates alcohol quality
## Min. :0.09173 Min. :0.06015 Min. : 0.8857 Min. :3.000
## 1st Qu.:3.09000 1st Qu.:0.41000 1st Qu.: 9.5000 1st Qu.:5.000
## Median :3.18000 Median :0.47000 Median :10.4000 Median :6.000
## Mean :3.18764 Mean :0.48976 Mean :10.5123 Mean :5.878
## 3rd Qu.:3.28000 3rd Qu.:0.55000 3rd Qu.:11.4000 3rd Qu.:6.000
## Max. :3.82000 Max. :1.08000 Max. :14.2000 Max. :9.000
dim(white.wine)
## [1] 4899 12
We now introduced the median point as a new point to the white wine quality data.
Pairs Plot
First, with white wine, conduct a pairs plot to see if there are any particular patterns.
pairs(white.wine, main = "Pairs Plot of White Wine")
The pairs plot tells us a bit about the relationship between variables in the dataset. Specifically, we can see that a linear model seems appropriate, although some of the variables seem to have less of a linear relationship (we can look into that by conducting hypothesis tests). Also, the multicollinearity between the variables seem less than the red wine data, since some of the plots show a random scatter. Now, we will conduct a linear regression against the quality of the white wine.
Fitting Into a Linear Model
white.mdl = lm(quality~., data=white.wine)
white.sum = summary(white.mdl)
There are a couple of things that we notice from the summary of the linear model.
Anova Test
In order to see if some of the regressors are insignificant to the regression, we will first run the anova test.
anova(white.mdl)
## Analysis of Variance Table
##
## Response: quality
## Df Sum Sq Mean Sq F value Pr(>F)
## fixed.acidity 1 49.23 49.23 86.1807 < 2.2e-16 ***
## volatile.acidity 1 148.85 148.85 260.5770 < 2.2e-16 ***
## citric.acid 1 0.08 0.08 0.1477 0.7008
## residual.sugar 1 21.47 21.47 37.5888 9.421e-10 ***
## chlorides 1 137.76 137.76 241.1641 < 2.2e-16 ***
## free.sulfur.dioxide 1 2.01 2.01 3.5142 0.0609 .
## total.sulfur.dioxide 1 72.93 72.93 127.6766 < 2.2e-16 ***
## density 1 0.44 0.44 0.7631 0.3824
## pH 1 12.40 12.40 21.7057 3.263e-06 ***
## sulphates 1 14.11 14.11 24.7023 6.919e-07 ***
## alcohol 1 590.08 590.08 1032.9822 < 2.2e-16 ***
## Residuals 4887 2791.64 0.57
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The anova test conducts a partial F-test, and supports our hypothesis that citric acid seem to be insiginificant to the linear model given that the variables before them are already included in the model.
To check which of the variables must be selected in building the model, we will conduct a variable selection later on.
Multicollinearity
In the pairs plot above, we suspected that there may be near linear relationship between some of the regressor variables in the data. We will conduct a diagnostics since multicollinearity will cause a serious problem that may dramatically impact the usefulness of the linear model. We will use the variance inflation factor (VIF) to check.
vif(white.mdl)
## fixed.acidity volatile.acidity citric.acid
## 1.403105 1.129854 1.161188
## residual.sugar chlorides free.sulfur.dioxide
## 1.492696 1.206257 1.744774
## total.sulfur.dioxide density pH
## 2.156138 1.259156 1.481981
## sulphates alcohol
## 1.059733 1.638633
Our textbook suggests that VIF larger than 10 will cause serious problems with multicollinearity. Here, VIF for residual sugar and density are larger than 10 and therefore we have a problem. We will look more into this multicollinearity later on.
Residual Analysis
We will first go over the graphic analysis of the residuals to check if the error are i.i.d. normally distributed, with zero mean and constant variance.
plot(white.mdl, which=1)
Comments on the residual vs. fitted plot
plot(white.mdl, which=2)
We observe that the residuals for our model meet the normality assumption. There seems to be a slight negative skew to the distribution of the residuals. Also, there are some potential influential points. We will look at this in a moment.
plot(white.mdl, which=3)
Comments on the Scale-Location Plot
plot(white.mdl, which=5)
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
Comments on the Residuals vs. Leverage Plot
par(mfrow = c(2,2))
plot(white.mdl, which=1)
plot(white.mdl, which=2)
plot(white.mdl, which=3)
plot(white.mdl, which=5)
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
Outlier Detection
For the outlier detection, we are going to use externally studentized residuals and plot them against various values.
white.ti = rstudent(white.mdl)
plot(white.mdl$fitted.values, white.ti, xlab = "Fitted Values", ylab="Externally Studentized Residuals", main="Studentized Residuals vs. Fitted Values")
Identifying the points on the plot, points 4899, 4676 and 448 are potential outliers. To investigate the influence of these points on the model, we will obtain an equation with these two observations deleted.
white.wine.del = white.wine[-c(4899,4676,448), ]
white.mdl.del = lm(quality ~ . , data=white.wine.del)
white.sum = summary(white.mdl)
white.sum.del = summary(white.mdl.del)
white.mse = mean(white.sum$residuals^2)
white.mse.del = mean(white.sum.del$residuals^2)
white.sum
##
## Call:
## lm(formula = quality ~ ., data = white.wine)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.9108 -0.4952 -0.0340 0.4665 3.1740
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.8639971 0.7562491 7.754 1.08e-14 ***
## fixed.acidity -0.0461155 0.0150720 -3.060 0.00223 **
## volatile.acidity -1.9549206 0.1138344 -17.173 < 2e-16 ***
## citric.acid -0.0274293 0.0961178 -0.285 0.77537
## residual.sugar 0.0271816 0.0026015 10.448 < 2e-16 ***
## chlorides -0.9167898 0.5427403 -1.689 0.09125 .
## free.sulfur.dioxide 0.0047483 0.0008387 5.662 1.58e-08 ***
## total.sulfur.dioxide -0.0008550 0.0003729 -2.293 0.02191 *
## density -3.9026551 0.8366436 -4.665 3.17e-06 ***
## pH 0.1882683 0.0835603 2.253 0.02430 *
## sulphates 0.4247037 0.0972815 4.366 1.29e-05 ***
## alcohol 0.3588470 0.0111651 32.140 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7558 on 4887 degrees of freedom
## Multiple R-squared: 0.2732, Adjusted R-squared: 0.2716
## F-statistic: 167 on 11 and 4887 DF, p-value: < 2.2e-16
white.sum.del
##
## Call:
## lm(formula = quality ~ ., data = white.wine.del)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.8365 -0.4935 -0.0380 0.4640 3.1148
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.501e+02 1.881e+01 7.980 1.81e-15 ***
## fixed.acidity 6.534e-02 2.088e-02 3.129 0.00177 **
## volatile.acidity -1.864e+00 1.138e-01 -16.378 < 2e-16 ***
## citric.acid 2.247e-02 9.579e-02 0.235 0.81457
## residual.sugar 8.141e-02 7.530e-03 10.810 < 2e-16 ***
## chlorides -2.489e-01 5.467e-01 -0.455 0.64888
## free.sulfur.dioxide 3.737e-03 8.445e-04 4.425 9.84e-06 ***
## total.sulfur.dioxide -2.832e-04 3.782e-04 -0.749 0.45403
## density -1.502e+02 1.908e+01 -7.871 4.29e-15 ***
## pH 6.848e-01 1.054e-01 6.494 9.17e-11 ***
## sulphates 6.314e-01 1.004e-01 6.288 3.50e-10 ***
## alcohol 1.936e-01 2.423e-02 7.991 1.65e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7515 on 4884 degrees of freedom
## Multiple R-squared: 0.2819, Adjusted R-squared: 0.2803
## F-statistic: 174.3 on 11 and 4884 DF, p-value: < 2.2e-16
white.mse
## [1] 0.5698386
white.mse.del
## [1] 0.5633514
Deleting the potential outlier points has a lot of effect on the regression coefficients. In other words, points 2762 and 4746 had a huge impact on the slope and the intercept of the model. In fact, deleting the points increased the \(R^2\) value from 0.2802 to 0.2862, although it is a slight increase. Also, the mean squared error of the model decreases as well. Hence, we conclude that points 2762 and 4746 are influential.
We will use this adjusted model from now on.
However, the improved model still has a low value of \(R^2\). Based on the information we acquired from both the residuals vs. fitted values, we may say that the deficiency of the model is due to trying to fit a discrete quality value to a continuous data.
Outlier Diagnostics : Cook’s Distance
Cook’s distance is one of the ways to measure a point’s influence by considering both the location of a point in the x space and the response variable.
Points with large values of \(D_i\) can be interpreted as an influential point.
white.cooksd = cooks.distance(white.mdl)
white.max.di = max(white.cooksd)
white.max.di
## [1] 3037.942
plot(white.cooksd, pch="*", cex=2, ylab="Cook's Distance", main="Influential Observations by Cook's Distance")
abline(h = 10*mean(white.cooksd, na.rm=T), col="red") # adding the cutoff line
text(x=1:length(white.cooksd)+1, y=white.cooksd, labels=ifelse(white.cooksd>10*mean(white.cooksd, na.rm=T),names(white.cooksd),""), col="red")
When interpreting Cook’s distance, we categorize the values as follows:
Even though the maximum value for the Cook’s distance is 0.02267929, which is way smaller than 0.5, but since it is still larger than other \(D_i\) values, we will take a closer look.
We will fit the model by deleting point 3308.
white.wine.inf = white.wine[-4899, ]
white.mdl.inf = lm(quality ~ . , data=white.wine.inf)
white.sum.inf = summary(white.mdl.inf)
white.mse.inf = mean(white.sum.inf$residuals^2)
white.sum
##
## Call:
## lm(formula = quality ~ ., data = white.wine)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.9108 -0.4952 -0.0340 0.4665 3.1740
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.8639971 0.7562491 7.754 1.08e-14 ***
## fixed.acidity -0.0461155 0.0150720 -3.060 0.00223 **
## volatile.acidity -1.9549206 0.1138344 -17.173 < 2e-16 ***
## citric.acid -0.0274293 0.0961178 -0.285 0.77537
## residual.sugar 0.0271816 0.0026015 10.448 < 2e-16 ***
## chlorides -0.9167898 0.5427403 -1.689 0.09125 .
## free.sulfur.dioxide 0.0047483 0.0008387 5.662 1.58e-08 ***
## total.sulfur.dioxide -0.0008550 0.0003729 -2.293 0.02191 *
## density -3.9026551 0.8366436 -4.665 3.17e-06 ***
## pH 0.1882683 0.0835603 2.253 0.02430 *
## sulphates 0.4247037 0.0972815 4.366 1.29e-05 ***
## alcohol 0.3588470 0.0111651 32.140 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7558 on 4887 degrees of freedom
## Multiple R-squared: 0.2732, Adjusted R-squared: 0.2716
## F-statistic: 167 on 11 and 4887 DF, p-value: < 2.2e-16
white.sum.inf
##
## Call:
## lm(formula = quality ~ ., data = white.wine.inf)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.8348 -0.4934 -0.0379 0.4637 3.1143
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.502e+02 1.880e+01 7.987 1.71e-15 ***
## fixed.acidity 6.552e-02 2.087e-02 3.139 0.00171 **
## volatile.acidity -1.863e+00 1.138e-01 -16.373 < 2e-16 ***
## citric.acid 2.209e-02 9.577e-02 0.231 0.81759
## residual.sugar 8.148e-02 7.527e-03 10.825 < 2e-16 ***
## chlorides -2.473e-01 5.465e-01 -0.452 0.65097
## free.sulfur.dioxide 3.733e-03 8.441e-04 4.422 9.99e-06 ***
## total.sulfur.dioxide -2.857e-04 3.781e-04 -0.756 0.44979
## density -1.503e+02 1.907e+01 -7.879 4.04e-15 ***
## pH 6.863e-01 1.054e-01 6.513 8.10e-11 ***
## sulphates 6.315e-01 1.004e-01 6.291 3.44e-10 ***
## alcohol 1.935e-01 2.422e-02 7.988 1.70e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7514 on 4886 degrees of freedom
## Multiple R-squared: 0.2819, Adjusted R-squared: 0.2803
## F-statistic: 174.3 on 11 and 4886 DF, p-value: < 2.2e-16
white.mse
## [1] 0.5698386
white.mse.inf
## [1] 0.5631541
Deleting the point increases the \(R^2\) and the mean squared error value by just a little bit, which is not as desirable. Hence, we conclude that the point 3308 is not influential.
As mentioned above, this may be due to trying to fit a discrete quality value to a continuous data.
Previously, the summary of the full model has told us that our model is not doing a good job in fitting the data. And while analyzing the residuals, we realized the fact that fitting a discrete data into a continuous model may be causing such problem.
Before we go into transforamtion on ordinal variables, we will try some conventional transformations.
Log Transformation
First, let’s try a log transformation to see if it will improve our regression.
white.log = lm(log(quality) ~ ., data=white.wine.inf)
white.log.sum = summary(white.log)
white.sum.inf
##
## Call:
## lm(formula = quality ~ ., data = white.wine.inf)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.8348 -0.4934 -0.0379 0.4637 3.1143
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.502e+02 1.880e+01 7.987 1.71e-15 ***
## fixed.acidity 6.552e-02 2.087e-02 3.139 0.00171 **
## volatile.acidity -1.863e+00 1.138e-01 -16.373 < 2e-16 ***
## citric.acid 2.209e-02 9.577e-02 0.231 0.81759
## residual.sugar 8.148e-02 7.527e-03 10.825 < 2e-16 ***
## chlorides -2.473e-01 5.465e-01 -0.452 0.65097
## free.sulfur.dioxide 3.733e-03 8.441e-04 4.422 9.99e-06 ***
## total.sulfur.dioxide -2.857e-04 3.781e-04 -0.756 0.44979
## density -1.503e+02 1.907e+01 -7.879 4.04e-15 ***
## pH 6.863e-01 1.054e-01 6.513 8.10e-11 ***
## sulphates 6.315e-01 1.004e-01 6.291 3.44e-10 ***
## alcohol 1.935e-01 2.422e-02 7.988 1.70e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7514 on 4886 degrees of freedom
## Multiple R-squared: 0.2819, Adjusted R-squared: 0.2803
## F-statistic: 174.3 on 11 and 4886 DF, p-value: < 2.2e-16
white.log.sum
##
## Call:
## lm(formula = log(quality) ~ ., data = white.wine.inf)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.81782 -0.08049 0.00273 0.08392 0.51448
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.653e+01 3.281e+00 8.084 7.81e-16 ***
## fixed.acidity 9.151e-03 3.642e-03 2.512 0.012 *
## volatile.acidity -3.491e-01 1.986e-02 -17.579 < 2e-16 ***
## citric.acid 7.227e-03 1.671e-02 0.432 0.665
## residual.sugar 1.412e-02 1.313e-03 10.752 < 2e-16 ***
## chlorides -4.601e-02 9.537e-02 -0.482 0.630
## free.sulfur.dioxide 5.862e-04 1.473e-04 3.980 7.00e-05 ***
## total.sulfur.dioxide -2.687e-05 6.597e-05 -0.407 0.684
## density -2.574e+01 3.328e+00 -7.734 1.26e-14 ***
## pH 1.105e-01 1.839e-02 6.009 2.00e-09 ***
## sulphates 1.102e-01 1.752e-02 6.294 3.37e-10 ***
## alcohol 3.251e-02 4.226e-03 7.692 1.74e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1311 on 4886 degrees of freedom
## Multiple R-squared: 0.2759, Adjusted R-squared: 0.2743
## F-statistic: 169.2 on 11 and 4886 DF, p-value: < 2.2e-16
By taking logarthims of quality values, estimates of each of the coefficients changed quite a lot. However, the log transformation decreased the \(R^2\) value, which is not good.
Since our goal was to improve the model to better fit the data, we will skip further analysis on the log transformation model and move on.
Square-Root Transformation
Next, we will do a square-root transformation to see if it will improve our regression.
white.sqr = lm(sqrt(quality) ~ ., data=white.wine.inf)
white.sqr.sum = summary(white.sqr)
white.sum.inf
##
## Call:
## lm(formula = quality ~ ., data = white.wine.inf)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.8348 -0.4934 -0.0379 0.4637 3.1143
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.502e+02 1.880e+01 7.987 1.71e-15 ***
## fixed.acidity 6.552e-02 2.087e-02 3.139 0.00171 **
## volatile.acidity -1.863e+00 1.138e-01 -16.373 < 2e-16 ***
## citric.acid 2.209e-02 9.577e-02 0.231 0.81759
## residual.sugar 8.148e-02 7.527e-03 10.825 < 2e-16 ***
## chlorides -2.473e-01 5.465e-01 -0.452 0.65097
## free.sulfur.dioxide 3.733e-03 8.441e-04 4.422 9.99e-06 ***
## total.sulfur.dioxide -2.857e-04 3.781e-04 -0.756 0.44979
## density -1.503e+02 1.907e+01 -7.879 4.04e-15 ***
## pH 6.863e-01 1.054e-01 6.513 8.10e-11 ***
## sulphates 6.315e-01 1.004e-01 6.291 3.44e-10 ***
## alcohol 1.935e-01 2.422e-02 7.988 1.70e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7514 on 4886 degrees of freedom
## Multiple R-squared: 0.2819, Adjusted R-squared: 0.2803
## F-statistic: 174.3 on 11 and 4886 DF, p-value: < 2.2e-16
white.sqr.sum
##
## Call:
## lm(formula = sqrt(quality) ~ ., data = white.wine.inf)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.87914 -0.10005 -0.00190 0.09852 0.60553
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.220e+01 3.903e+00 8.250 < 2e-16 ***
## fixed.acidity 1.233e-02 4.333e-03 2.846 0.00445 **
## volatile.acidity -4.023e-01 2.362e-02 -17.031 < 2e-16 ***
## citric.acid 6.448e-03 1.988e-02 0.324 0.74566
## residual.sugar 1.690e-02 1.562e-03 10.817 < 2e-16 ***
## chlorides -5.292e-02 1.134e-01 -0.467 0.64087
## free.sulfur.dioxide 7.411e-04 1.752e-04 4.229 2.39e-05 ***
## total.sulfur.dioxide -4.640e-05 7.847e-05 -0.591 0.55436
## density -3.099e+01 3.959e+00 -7.827 6.08e-15 ***
## pH 1.375e-01 2.187e-02 6.285 3.56e-10 ***
## sulphates 1.316e-01 2.084e-02 6.315 2.95e-10 ***
## alcohol 3.961e-02 5.027e-03 7.879 4.02e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.156 on 4886 degrees of freedom
## Multiple R-squared: 0.2803, Adjusted R-squared: 0.2787
## F-statistic: 173 on 11 and 4886 DF, p-value: < 2.2e-16
By taking the square-root of quality values, estimates of each of the coefficients changed quite a lot. However, the square-root transformation decreased the \(R^2\) value, which is not good.
Since our goal was to improve the model to better fit the data, we will skip further analysis on the square-root transformation model and move on.
Ordinal Variable Regression
Taking the discreteness of the quality values into consideration, we will now look into transforming ordinal variables.
#white.wine.del$quality = factor(white.wine.del$quality)
#white.tran.mdl = polr(quality ~ . , data=white.wine.del, Hess=TRUE)
#white.tran.sum = summary(white.tran.mdl)
Still figuring out how to do this part …
As we saw in the beginning, some of the variables did not seem significant to our model. Since incorrect model specification can cause serious problems, we will try to find the most appropriate subset of regressors for our model.
We will first define all the possible candidates then compare their adjusted \(R^2\) values.
First step in choosing the best set of regressors for our model, we will fit all the regression equations involving one, two regressors, and so on. Then we will select the subset of predictors that do the best at meeting some well-defined objective criterion, such as a large adjusted \(R^2\) value or the small MSE, Mallow’s \(C_p\), or AIC.
white.ols = ols_step_best_subset(white.mdl.inf)
white.ols
## Best Subsets Regression
## --------------------------------------------------------------------------------------------------------------------------------------------------------
## Model Index Predictors
## --------------------------------------------------------------------------------------------------------------------------------------------------------
## 1 alcohol
## 2 volatile.acidity alcohol
## 3 volatile.acidity residual.sugar alcohol
## 4 volatile.acidity residual.sugar free.sulfur.dioxide alcohol
## 5 volatile.acidity residual.sugar density pH alcohol
## 6 volatile.acidity residual.sugar density pH sulphates alcohol
## 7 volatile.acidity residual.sugar free.sulfur.dioxide density pH sulphates alcohol
## 8 fixed.acidity volatile.acidity residual.sugar free.sulfur.dioxide density pH sulphates alcohol
## 9 fixed.acidity volatile.acidity residual.sugar free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
## 10 fixed.acidity volatile.acidity residual.sugar chlorides free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
## 11 fixed.acidity volatile.acidity citric.acid residual.sugar chlorides free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
## --------------------------------------------------------------------------------------------------------------------------------------------------------
##
## Subsets Regression Summary
## -----------------------------------------------------------------------------------------------------------------------------------------
## Adj. Pred
## Model R-Square R-Square R-Square C(p) AIC SBIC SBC MSEP FPE HSP APC
## -----------------------------------------------------------------------------------------------------------------------------------------
## 1 0.1897 0.1896 0.1891 618.9350 11684.7824 -2215.6105 11704.2722 3113.5281 0.6359 1e-04 0.8109
## 2 0.2402 0.2399 0.2393 277.3040 11371.5516 -2528.6907 11397.5379 2920.0529 0.5965 1e-04 0.7607
## 3 0.2585 0.2581 0.2573 154.8290 11254.1662 -2645.9902 11286.6491 2850.3214 0.5824 1e-04 0.7427
## 4 0.2640 0.2634 0.2619 119.6254 11219.9116 -2680.2301 11258.8911 2829.8798 0.5784 1e-04 0.7375
## 5 0.2711 0.2703 0.2689 73.4614 11174.5740 -2725.4969 11220.0500 2803.2347 0.5730 1e-04 0.7307
## 6 0.2767 0.2758 0.274 37.4649 11138.9039 -2761.0848 11190.8766 2782.3268 0.5689 1e-04 0.7254
## 7 0.2801 0.2791 0.2766 15.9119 11117.4069 -2782.5146 11175.8762 2769.5776 0.5664 1e-04 0.7222
## 8 0.2818 0.2806 0.2757 6.8056 11108.2878 -2791.5929 11173.2536 2763.8627 0.5653 1e-04 0.7209
## 9 0.2818 0.2805 0.2753 8.2383 11109.7192 -2790.1546 11181.1816 2764.1074 0.5655 1e-04 0.7211
## 10 0.2819 0.2804 0.2751 10.0532 11111.5336 -2788.3345 11189.4926 2764.5684 0.5657 1e-04 0.7214
## 11 0.2819 0.2803 0.2748 12.0000 11113.4803 -2786.3827 11197.9358 2765.1042 0.5659 1e-04 0.7217
## -----------------------------------------------------------------------------------------------------------------------------------------
## AIC: Akaike Information Criteria
## SBIC: Sawa's Bayesian Information Criteria
## SBC: Schwarz Bayesian Criteria
## MSEP: Estimated error of prediction, assuming multivariate normality
## FPE: Final Prediction Error
## HSP: Hocking's Sp
## APC: Amemiya Prediction Criteria
plot(white.ols)
best.white = lm(quality ~ fixed.acidity + volatile.acidity + residual.sugar + free.sulfur.dioxide + density + pH + sulphates + alcohol, data=white.wine.inf)
best.white.sum = summary(best.white)
Looking at the plots above, the increase in \(R^2\) and Adjusted \(R^2\) almost flattens by model 8. In fact, change in Mallow’s \(C_p\) statistic and AIC drastically decreases at model 8 as well.
Hence, best subset regression tells us that model 8, which has the following variables:
They best meet our criterion, which perfectly aligns with our expectation we had when we saw the large P-values in the full model.
Since comparing all possible regressions and the best subsets take a lot of time and work, we will choose only a few models by adding or deleting regressors one at a time. Here, keep in mind that we used AIC values to compare each steps.
Forward Selection
Start with a model with no regressors included, then we will add regressors as we go on.
start.white.mdl = lm(quality~1, data=white.wine.inf)
scope.white.mdl = lm(quality~., data=white.wine.inf)
step(start.white.mdl, direction = "forward", scope=formula(scope.white.mdl))
## Start: AIC=-1188.69
## quality ~ 1
##
## Df Sum of Sq RSS AIC
## + alcohol 1 728.73 3112.3 -2217.1
## + density 1 362.30 3478.7 -1672.0
## + chlorides 1 169.28 3671.7 -1407.5
## + volatile.acidity 1 145.64 3695.4 -1376.0
## + total.sulfur.dioxide 1 117.28 3723.7 -1338.6
## + fixed.acidity 1 49.62 3791.4 -1250.4
## + pH 1 37.97 3803.0 -1235.3
## + residual.sugar 1 36.57 3804.4 -1233.5
## + sulphates 1 11.07 3829.9 -1200.8
## <none> 3841.0 -1188.7
## + citric.acid 1 0.33 3840.7 -1187.1
## + free.sulfur.dioxide 1 0.26 3840.7 -1187.0
##
## Step: AIC=-2217.14
## quality ~ alcohol
##
## Df Sum of Sq RSS AIC
## + volatile.acidity 1 193.992 2918.3 -2530.4
## + free.sulfur.dioxide 1 56.181 3056.1 -2304.4
## + residual.sugar 1 46.959 3065.3 -2289.6
## + fixed.acidity 1 14.509 3097.8 -2238.0
## + sulphates 1 14.424 3097.8 -2237.9
## + chlorides 1 12.419 3099.8 -2234.7
## + density 1 10.484 3101.8 -2231.7
## + pH 1 8.442 3103.8 -2228.4
## + citric.acid 1 2.184 3110.1 -2218.6
## + total.sulfur.dioxide 1 2.079 3110.2 -2218.4
## <none> 3112.3 -2217.1
##
## Step: AIC=-2530.37
## quality ~ alcohol + volatile.acidity
##
## Df Sum of Sq RSS AIC
## + residual.sugar 1 70.271 2848.0 -2647.8
## + free.sulfur.dioxide 1 40.484 2877.8 -2596.8
## + density 1 25.639 2892.6 -2571.6
## + fixed.acidity 1 16.109 2902.2 -2555.5
## + total.sulfur.dioxide 1 11.165 2907.1 -2547.2
## + sulphates 1 11.006 2907.3 -2546.9
## + pH 1 5.489 2912.8 -2537.6
## + chlorides 1 4.473 2913.8 -2535.9
## <none> 2918.3 -2530.4
## + citric.acid 1 0.301 2918.0 -2528.9
##
## Step: AIC=-2647.76
## quality ~ alcohol + volatile.acidity + residual.sugar
##
## Df Sum of Sq RSS AIC
## + free.sulfur.dioxide 1 21.0028 2827.0 -2682.0
## + density 1 20.8069 2827.2 -2681.7
## + fixed.acidity 1 19.0015 2829.0 -2678.5
## + pH 1 13.5270 2834.5 -2669.1
## + sulphates 1 13.0677 2834.9 -2668.3
## + total.sulfur.dioxide 1 1.9014 2846.1 -2649.0
## + chlorides 1 1.6387 2846.3 -2648.6
## + citric.acid 1 1.5879 2846.4 -2648.5
## <none> 2848.0 -2647.8
##
## Step: AIC=-2682.01
## quality ~ alcohol + volatile.acidity + residual.sugar + free.sulfur.dioxide
##
## Df Sum of Sq RSS AIC
## + density 1 19.0411 2807.9 -2713.1
## + fixed.acidity 1 15.4963 2811.5 -2706.9
## + pH 1 11.4714 2815.5 -2699.9
## + sulphates 1 11.0877 2815.9 -2699.3
## + total.sulfur.dioxide 1 2.4096 2824.6 -2684.2
## + chlorides 1 2.2177 2824.8 -2683.8
## + citric.acid 1 2.2107 2824.8 -2683.8
## <none> 2827.0 -2682.0
##
## Step: AIC=-2713.11
## quality ~ alcohol + volatile.acidity + residual.sugar + free.sulfur.dioxide +
## density
##
## Df Sum of Sq RSS AIC
## + pH 1 23.9315 2784.0 -2753.0
## + sulphates 1 22.2233 2785.7 -2750.0
## + fixed.acidity 1 4.1638 2803.8 -2718.4
## + chlorides 1 1.4137 2806.5 -2713.6
## <none> 2807.9 -2713.1
## + citric.acid 1 0.4310 2807.5 -2711.9
## + total.sulfur.dioxide 1 0.0843 2807.9 -2711.3
##
## Step: AIC=-2753.04
## quality ~ alcohol + volatile.acidity + residual.sugar + free.sulfur.dioxide +
## density + pH
##
## Df Sum of Sq RSS AIC
## + sulphates 1 18.9648 2765.1 -2784.5
## + fixed.acidity 1 2.9318 2781.1 -2756.2
## <none> 2784.0 -2753.0
## + chlorides 1 0.5578 2783.5 -2752.0
## + citric.acid 1 0.2232 2783.8 -2751.4
## + total.sulfur.dioxide 1 0.0992 2783.9 -2751.2
##
## Step: AIC=-2784.51
## quality ~ alcohol + volatile.acidity + residual.sugar + free.sulfur.dioxide +
## density + pH + sulphates
##
## Df Sum of Sq RSS AIC
## + fixed.acidity 1 6.2700 2758.8 -2793.6
## <none> 2765.1 -2784.5
## + chlorides 1 0.5311 2764.5 -2783.5
## + total.sulfur.dioxide 1 0.4328 2764.6 -2783.3
## + citric.acid 1 0.1415 2764.9 -2782.8
##
## Step: AIC=-2793.63
## quality ~ alcohol + volatile.acidity + residual.sugar + free.sulfur.dioxide +
## density + pH + sulphates + fixed.acidity
##
## Df Sum of Sq RSS AIC
## <none> 2758.8 -2793.6
## + total.sulfur.dioxide 1 0.32024 2758.5 -2792.2
## + chlorides 1 0.10967 2758.7 -2791.8
## + citric.acid 1 0.01298 2758.8 -2791.7
##
## Call:
## lm(formula = quality ~ alcohol + volatile.acidity + residual.sugar +
## free.sulfur.dioxide + density + pH + sulphates + fixed.acidity,
## data = white.wine.inf)
##
## Coefficients:
## (Intercept) alcohol volatile.acidity
## 1.541e+02 1.932e-01 -1.888e+00
## residual.sugar free.sulfur.dioxide density
## 8.285e-02 3.349e-03 -1.543e+02
## pH sulphates fixed.acidity
## 6.942e-01 6.285e-01 6.810e-02
forward.white = lm(quality ~ alcohol + volatile.acidity + residual.sugar +
free.sulfur.dioxide + density + pH + sulphates + fixed.acidity,
data = white.wine.inf)
forward.white.sum = summary(forward.white)
Our final model has 8 of the 11 possible predictors (written in the order they were added):
Coincidentally, the chosen 8 variables are identical as our choice for best subsets regression.
Backward Elimination
Start with a model with all the regressors included, then we will eliminate the insignificant regressors as we go on.
start.white.mdl2 = lm(quality~., data=white.wine.inf)
step(start.white.mdl2, direction = "backward")
## Start: AIC=-2788.44
## quality ~ fixed.acidity + volatile.acidity + citric.acid + residual.sugar +
## chlorides + free.sulfur.dioxide + total.sulfur.dioxide +
## density + pH + sulphates + alcohol
##
## Df Sum of Sq RSS AIC
## - citric.acid 1 0.030 2758.4 -2790.4
## - chlorides 1 0.116 2758.4 -2790.2
## - total.sulfur.dioxide 1 0.323 2758.7 -2789.9
## <none> 2758.3 -2788.4
## - fixed.acidity 1 5.562 2763.9 -2780.6
## - free.sulfur.dioxide 1 11.039 2769.4 -2770.9
## - sulphates 1 22.339 2780.7 -2750.9
## - pH 1 23.948 2782.3 -2748.1
## - density 1 35.044 2793.4 -2728.6
## - alcohol 1 36.020 2794.3 -2726.9
## - residual.sugar 1 66.152 2824.5 -2674.4
## - volatile.acidity 1 151.345 2909.7 -2528.8
##
## Step: AIC=-2790.39
## quality ~ fixed.acidity + volatile.acidity + residual.sugar +
## chlorides + free.sulfur.dioxide + total.sulfur.dioxide +
## density + pH + sulphates + alcohol
##
## Df Sum of Sq RSS AIC
## - chlorides 1 0.105 2758.5 -2792.2
## - total.sulfur.dioxide 1 0.315 2758.7 -2791.8
## <none> 2758.4 -2790.4
## - fixed.acidity 1 5.749 2764.1 -2782.2
## - free.sulfur.dioxide 1 11.096 2769.4 -2772.7
## - sulphates 1 22.444 2780.8 -2752.7
## - pH 1 23.971 2782.3 -2750.0
## - density 1 35.066 2793.4 -2730.5
## - alcohol 1 36.540 2794.9 -2727.9
## - residual.sugar 1 66.160 2824.5 -2676.3
## - volatile.acidity 1 156.805 2915.2 -2521.6
##
## Step: AIC=-2792.2
## quality ~ fixed.acidity + volatile.acidity + residual.sugar +
## free.sulfur.dioxide + total.sulfur.dioxide + density + pH +
## sulphates + alcohol
##
## Df Sum of Sq RSS AIC
## - total.sulfur.dioxide 1 0.320 2758.8 -2793.6
## <none> 2758.5 -2792.2
## - fixed.acidity 1 6.157 2764.6 -2783.3
## - free.sulfur.dioxide 1 11.036 2769.5 -2774.7
## - sulphates 1 22.570 2781.0 -2754.3
## - pH 1 25.297 2783.8 -2749.5
## - alcohol 1 36.536 2795.0 -2729.8
## - density 1 36.823 2795.3 -2729.2
## - residual.sugar 1 70.134 2828.6 -2671.2
## - volatile.acidity 1 158.543 2917.0 -2520.5
##
## Step: AIC=-2793.63
## quality ~ fixed.acidity + volatile.acidity + residual.sugar +
## free.sulfur.dioxide + density + pH + sulphates + alcohol
##
## Df Sum of Sq RSS AIC
## <none> 2758.8 -2793.6
## - fixed.acidity 1 6.270 2765.1 -2784.5
## - free.sulfur.dioxide 1 13.826 2772.6 -2771.2
## - sulphates 1 22.303 2781.1 -2756.2
## - pH 1 25.460 2784.2 -2750.6
## - alcohol 1 36.300 2795.1 -2731.6
## - density 1 39.920 2798.7 -2725.3
## - residual.sugar 1 72.942 2831.7 -2667.8
## - volatile.acidity 1 167.753 2926.5 -2506.5
##
## Call:
## lm(formula = quality ~ fixed.acidity + volatile.acidity + residual.sugar +
## free.sulfur.dioxide + density + pH + sulphates + alcohol,
## data = white.wine.inf)
##
## Coefficients:
## (Intercept) fixed.acidity volatile.acidity
## 1.541e+02 6.810e-02 -1.888e+00
## residual.sugar free.sulfur.dioxide density
## 8.285e-02 3.349e-03 -1.543e+02
## pH sulphates alcohol
## 6.942e-01 6.285e-01 1.932e-01
backward.white = lm(quality ~ fixed.acidity + volatile.acidity + residual.sugar +
free.sulfur.dioxide + density + pH + sulphates + alcohol,
data = white.wine.inf)
backward.white.sum = summary(backward.white)
Our final model has 8 of the 11 possible predictors (written in their order in the model):
Again, the chosen 8 variables are identical as our choice for best subsets regression and forward selection.
Stepwise Regression
Repeat forward selection and backward elimination.
step(white.mdl.inf, direction= "both")
## Start: AIC=-2788.44
## quality ~ fixed.acidity + volatile.acidity + citric.acid + residual.sugar +
## chlorides + free.sulfur.dioxide + total.sulfur.dioxide +
## density + pH + sulphates + alcohol
##
## Df Sum of Sq RSS AIC
## - citric.acid 1 0.030 2758.4 -2790.4
## - chlorides 1 0.116 2758.4 -2790.2
## - total.sulfur.dioxide 1 0.323 2758.7 -2789.9
## <none> 2758.3 -2788.4
## - fixed.acidity 1 5.562 2763.9 -2780.6
## - free.sulfur.dioxide 1 11.039 2769.4 -2770.9
## - sulphates 1 22.339 2780.7 -2750.9
## - pH 1 23.948 2782.3 -2748.1
## - density 1 35.044 2793.4 -2728.6
## - alcohol 1 36.020 2794.3 -2726.9
## - residual.sugar 1 66.152 2824.5 -2674.4
## - volatile.acidity 1 151.345 2909.7 -2528.8
##
## Step: AIC=-2790.39
## quality ~ fixed.acidity + volatile.acidity + residual.sugar +
## chlorides + free.sulfur.dioxide + total.sulfur.dioxide +
## density + pH + sulphates + alcohol
##
## Df Sum of Sq RSS AIC
## - chlorides 1 0.105 2758.5 -2792.2
## - total.sulfur.dioxide 1 0.315 2758.7 -2791.8
## <none> 2758.4 -2790.4
## + citric.acid 1 0.030 2758.3 -2788.4
## - fixed.acidity 1 5.749 2764.1 -2782.2
## - free.sulfur.dioxide 1 11.096 2769.4 -2772.7
## - sulphates 1 22.444 2780.8 -2752.7
## - pH 1 23.971 2782.3 -2750.0
## - density 1 35.066 2793.4 -2730.5
## - alcohol 1 36.540 2794.9 -2727.9
## - residual.sugar 1 66.160 2824.5 -2676.3
## - volatile.acidity 1 156.805 2915.2 -2521.6
##
## Step: AIC=-2792.2
## quality ~ fixed.acidity + volatile.acidity + residual.sugar +
## free.sulfur.dioxide + total.sulfur.dioxide + density + pH +
## sulphates + alcohol
##
## Df Sum of Sq RSS AIC
## - total.sulfur.dioxide 1 0.320 2758.8 -2793.6
## <none> 2758.5 -2792.2
## + chlorides 1 0.105 2758.4 -2790.4
## + citric.acid 1 0.019 2758.4 -2790.2
## - fixed.acidity 1 6.157 2764.6 -2783.3
## - free.sulfur.dioxide 1 11.036 2769.5 -2774.7
## - sulphates 1 22.570 2781.0 -2754.3
## - pH 1 25.297 2783.8 -2749.5
## - alcohol 1 36.536 2795.0 -2729.8
## - density 1 36.823 2795.3 -2729.2
## - residual.sugar 1 70.134 2828.6 -2671.2
## - volatile.acidity 1 158.543 2917.0 -2520.5
##
## Step: AIC=-2793.63
## quality ~ fixed.acidity + volatile.acidity + residual.sugar +
## free.sulfur.dioxide + density + pH + sulphates + alcohol
##
## Df Sum of Sq RSS AIC
## <none> 2758.8 -2793.6
## + total.sulfur.dioxide 1 0.320 2758.5 -2792.2
## + chlorides 1 0.110 2758.7 -2791.8
## + citric.acid 1 0.013 2758.8 -2791.7
## - fixed.acidity 1 6.270 2765.1 -2784.5
## - free.sulfur.dioxide 1 13.826 2772.6 -2771.2
## - sulphates 1 22.303 2781.1 -2756.2
## - pH 1 25.460 2784.2 -2750.6
## - alcohol 1 36.300 2795.1 -2731.6
## - density 1 39.920 2798.7 -2725.3
## - residual.sugar 1 72.942 2831.7 -2667.8
## - volatile.acidity 1 167.753 2926.5 -2506.5
##
## Call:
## lm(formula = quality ~ fixed.acidity + volatile.acidity + residual.sugar +
## free.sulfur.dioxide + density + pH + sulphates + alcohol,
## data = white.wine.inf)
##
## Coefficients:
## (Intercept) fixed.acidity volatile.acidity
## 1.541e+02 6.810e-02 -1.888e+00
## residual.sugar free.sulfur.dioxide density
## 8.285e-02 3.349e-03 -1.543e+02
## pH sulphates alcohol
## 6.942e-01 6.285e-01 1.932e-01
both.white = lm(quality ~ fixed.acidity + volatile.acidity + residual.sugar +
free.sulfur.dioxide + density + pH + sulphates + alcohol,
data = white.wine.inf)
both.white.sum = summary(both.white)
Our final model has 8 of the 11 possible predictors (written in their order in the model):
As it turns out, the chosen 8 variables are identical for all our variable selection methods.
Now, we will compare all our models by their adjusted \(R^2\).
# Adjusted R-squared for the full model
white.rsq = white.sum.inf$adj.r.squared
white.rsq
## [1] 0.2802536
# Adjusted R-squared for best subset regression model
best.white.rsq = best.white.sum$adj.r.squared
best.white.rsq
## [1] 0.2805767
# Adjusted R-squared for forward selection model
forward.white.rsq = forward.white.sum$adj.r.squared
forward.white.rsq
## [1] 0.2805767
# Adjusted R-squared for backward elimination model
backward.white.rsq = backward.white.sum$adj.r.squared
backward.white.rsq
## [1] 0.2805767
# Adjusted R-squared for stepwise regression model
both.white.rsq = both.white.sum$adj.r.squared
both.white.rsq
## [1] 0.2805767
Clearly, even after selecting the appropriate regressors, the adjusted \(R^2\) did not improve. In other words, the subset models do not show much of a higher performance. In fact, as we saw above, all the variable selection models choose the same 8 variables and even generates equal adjusted \(R^2\) values.
Hence, from now on, the best model used will be
Quality = \(\beta_0\) + \(\beta_1\) * fixed acidity + \(\beta_2\) * volatile acidity + \(\beta_3\) * residual sugar + \(\beta_4\) * free sulfur dioxide + \(\beta_5\) * density + \(\beta_6\) * pH + \(\beta_7\) * sulphates + \(\beta_8\) * alcohol.
However, although these are all the variable selection method we learned, we should always keep in mind that there will be several equally good models. That is, we might be ignorant of the background knowledge of the collected data, i.e. there may be a whole other variable that affects wine quality.
With that in mind, we will assess our final model by investigating the residual plots.
fin.white = lm(quality ~ fixed.acidity + volatile.acidity + residual.sugar +
free.sulfur.dioxide + density + pH + sulphates + alcohol,
data = white.wine.inf)
plot(fin.white, which=1)
Comments on the residual vs. fitted plot
plot(fin.white, which=2)
We observe that the residuals for our model meet the normality assumption. The negative skew that we saw in the full model has reduced, and we cannot really spot potential influential points, which is a good thing.
plot(fin.white, which=3)
Comments on the Scale-Location Plot
par(mfrow = c(2,2))
plot(fin.white, which=1)
plot(fin.white, which=2)
plot(fin.white, which=3)
plot(fin.white, which=5)
Comments on the Residuals vs. Leverage Plot
par(mfrow = c(2,2))
plot(fin.white, which=1)
plot(fin.white, which=2)
plot(fin.white, which=3)
plot(fin.white, which=5)
white.cooksd2 = cooks.distance(fin.white)
white.max.di2 = max(white.cooksd2)
white.max.di2
## [1] 1.296286
plot(white.cooksd2, pch="*", cex=2, ylab="Cook's Distance", main="Influential Observations by Cook's Distance for White Wine")
abline(h = 10*mean(white.cooksd2, na.rm=T), col="red") # adding the cutoff line
text(x=1:length(white.cooksd2)+1, y=white.cooksd2, labels=ifelse(white.cooksd2>10*mean(white.cooksd2, na.rm=T),names(white.cooksd2),""), col="red")
## Model Validation
Before we jump to any conclusions, we should check the validity of our model. There is a good chance that this data was made so that one can predict the quality of a wine based on specific chemical attributes.
We will perform cross validation in order to see how well our model predicts the quality of white wine. We do this by dividing the dataset in such a way that 80 percent of the dataset is part of the training set and 20 percent of the dataset is the testing set.
Then we will compare the actual value to the predicted value. We repeat the steps of validation 5 times to check the values of R squared, root mean square error and mean absolute error, which will tell us how the model behaves.
Cross Validation - Full Model
First, we will see the prediction power of our full model with all our regressors.
set.seed(71168)
sample.white.n = ceiling(0.8*length(white.wine.inf$quality))
for(i in 1:5){
train.white.sample = sample(c(1:length(white.wine.inf$quality)),sample.white.n)
train.white.sample = sort(train.white.sample)
train.white.data = white.wine.inf[train.white.sample,]
test.white.data = white.wine.inf[-train.white.sample,]
train.white.mdl = lm(train.white.data$quality~., data = train.white.data)
summary(train.white.mdl)
preds.white = predict(train.white.mdl,test.white.data)
plot(test.white.data$quality,preds.white,xlim=c(4,10),ylim=c(4,10))
abline(c(0,1))
# R-squared
R.sq = R2(preds.white, test.white.data$quality)
#RMSPE
RMSPE = RMSE(preds.white, test.white.data$quality)
#MAPE
MAPE = MAE(preds.white, test.white.data$quality)
print(c(i,R.sq,RMSPE,MAPE))
}
## [1] 1.0000000 0.2780152 0.7504382 0.5866302
## [1] 2.0000000 0.2273238 0.7712567 0.6016954
## [1] 3.0000000 0.3076548 0.7370168 0.5790178
## [1] 4.0000000 0.2858487 0.7760121 0.5936838
## [1] 5.0000000 0.2730520 0.7367243 0.5665545
Here are the things that we notice from our numerical output.
Now, here are the things that we see from our graphical observations.
Cross Validation - Final Model
Next, let’s take a look at the prediction power of our final model with selected regressors.
set.seed(71168)
sample.white.n = ceiling(0.8*length(white.wine.inf$quality))
par(mfrow = c(3,3))
for(i in 1:5){
train.white.sample = sample(c(1:length(white.wine.inf$quality)),sample.white.n)
train.white.sample = sort(train.white.sample)
train.white.data = white.wine.inf[train.white.sample,]
test.white.data = white.wine.inf[-train.white.sample,]
train.white.fin = lm(quality ~ fixed.acidity + volatile.acidity + residual.sugar +
free.sulfur.dioxide + density + pH + sulphates + alcohol, data = train.white.data)
summary(train.white.fin)
preds.white.fin = predict(train.white.fin,test.white.data)
plot(test.white.data$quality,preds.white.fin,xlim=c(4,10),ylim=c(4,10))
abline(c(0,1))
# R-squared
R.sq = R2(preds.white.fin, test.white.data$quality)
#RMSPE
RMSPE = RMSE(preds.white.fin, test.white.data$quality)
#MAPE
MAPE = MAE(preds.white.fin, test.white.data$quality)
print(c(i,R.sq,RMSPE,MAPE))
}
## [1] 1.0000000 0.2786311 0.7501360 0.5865241
## [1] 2.0000000 0.2284019 0.7706200 0.6016314
## [1] 3.0000000 0.3102651 0.7356908 0.5786415
## [1] 4.0000000 0.2855969 0.7761478 0.5938715
## [1] 5.0000000 0.2738739 0.7362855 0.5667910
Both the numerical and graphical observations do not have a lot of difference from our full model.
Hence we conclude that both our models have no prediction powers.
This is where conclusion will take place.
Here are the works cited.
Montgomery, D. C., Peck, E. A., & Vining, G. G. (2012). Introduction to Linear Regression Analysis - 5th ed. Hoboken, New Jersey: John Wiley & Sons, Inc.
P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis. Modeling wine preferences by data mining from physicochemical properties. In Decision Support Systems, Elsevier, 47(4):547-553, 2009.